Example #1
def quantize_onnx(onnx_path, quantized_onnx_path):
    import onnx
    from onnxruntime.quantization import QuantizationMode, quantize

    # Load the exported ONNX graph
    onnx_model = onnx.load(onnx_path)

    # Quantize weights to int8, converting supported ops to their integer variants
    quantized_model = quantize(
        model=onnx_model,
        quantization_mode=QuantizationMode.IntegerOps,
        force_fusions=True,
        symmetric_weight=True,
    )

    # Save the quantized model
    onnx.save_model(quantized_model, quantized_onnx_path)
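A minimal usage sketch for the helper above; the file paths below are hypothetical placeholders.

# Hypothetical paths: an exported float32 model and the int8 output to write
quantize_onnx("models/onnx/bert-base-cased.onnx", "models/onnx/bert-base-cased-quantized.onnx")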
Example #2
def quantize_onnx(onnx_path, quantized_onnx_path):
    """
    Quantize the weights of the model from float32 to int8 to allow very efficient inference on modern CPU.
    """
    import onnx
    from onnxruntime.quantization import QuantizationMode, quantize

    onnx_model = onnx.load(onnx_path)

    quantized_model = quantize(
        model=onnx_model,
        quantization_mode=QuantizationMode.IntegerOps,
        force_fusions=True,
        symmetric_weight=True,
    )

    # Save model
    onnx.save_model(quantized_model, quantized_onnx_path)
Example #3
def quantize_onnx(onnx_path, quantized_onnx_path):
    """
    Quantize the weights of the model from float32 to int8 to allow very efficient inference on modern CPU.
    """
    import onnx
    from onnxruntime.quantization import QuantizationMode, quantize

    onnx_model = onnx.load(onnx_path)

    # Discussed with @yufenglee from ONNX Runtime, this will be addressed in the next release of onnxruntime
    print(
        "As of onnxruntime 1.4.0, models larger than 2GB will fail to quantize due to protobuf constraint.\n"
        "This limitation will be removed in the next release of onnxruntime.")

    quantized_model = quantize(
        model=onnx_model,
        quantization_mode=QuantizationMode.IntegerOps,
        force_fusions=True,
        symmetric_weight=True,
    )

    # Save model
    onnx.save_model(quantized_model, quantized_onnx_path)
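Because the example above warns about the 2 GB protobuf limit in onnxruntime 1.4.0, a small size check before calling quantize can fail fast. This is a sketch, not part of the original snippet; the function name is ours.

import os

def check_quantizable(onnx_path):
    # onnxruntime 1.4.0 cannot quantize models of 2 GB or more (protobuf limit)
    size_gb = os.path.getsize(onnx_path) / (1024 ** 3)
    if size_gb >= 2:
        raise ValueError(f"{onnx_path} is {size_gb:.2f} GB and too large to quantize with onnxruntime 1.4.0")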
Example #4
def quantize(onnx_model_path: Path) -> Path:
    """
    Quantize the weights of the model from float32 to int8 to allow very efficient inference on modern CPU.

    Args:
        onnx_model_path: Path to the location where the exported ONNX model is stored

    Returns: The Path generated for the quantized model
    """
    import onnx
    from onnxruntime.quantization import QuantizationMode, quantize

    onnx_model = onnx.load(onnx_model_path.as_posix())

    # Discussed with @yufenglee from ONNX Runtime, this will be addressed in the next release of onnxruntime
    print(
        "As of onnxruntime 1.4.0, models larger than 2GB will fail to quantize due to protobuf constraint.\n"
        "This limitation will be removed in the next release of onnxruntime."
    )

    quantized_model = quantize(
        model=onnx_model,
        quantization_mode=QuantizationMode.IntegerOps,
        force_fusions=True,
        symmetric_weight=True,
    )

    # Append "-quantized" at the end of the model's name
    quantized_model_path = generate_identified_filename(onnx_model_path, "-quantized")

    # Save model
    print(
        f"Quantized model has been written at {quantized_model_path}: \N{heavy check mark}"
    )
    onnx.save_model(quantized_model, quantized_model_path.as_posix())

    return quantized_model_path
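A usage sketch for the quantize helper in Example #4. The model path is hypothetical, generate_identified_filename is assumed to be available in the same module, and onnxruntime is assumed to be installed for running the result.

from pathlib import Path

import onnxruntime as ort

# Hypothetical path to a previously exported ONNX model
quantized_path = quantize(Path("onnx/bert-base-cased.onnx"))

# The quantized graph runs like any other ONNX model on CPU
session = ort.InferenceSession(quantized_path.as_posix(), providers=["CPUExecutionProvider"])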
# Standalone snippet: export a PyTorch model to ONNX, optimize the graph, then quantize it.
# Imports added for completeness; `net`, `model_path` and `model_name` are assumed to be defined earlier.
# `optimizer` is assumed to be the graph optimizer shipped as onnxruntime_tools.optimizer
# (onnxruntime.transformers.optimizer in newer releases).
import torch
import onnx
from onnxruntime.quantization import QuantizationMode, quantize
from onnxruntime_tools import optimizer

dummy_input = torch.randn(1, 3, 240, 320).to("cpu")
# dummy_input = torch.randn(1, 3, 480, 640).to("cuda")  # if input size is 640*480
torch.onnx.export(net,
                  dummy_input,
                  model_path,
                  verbose=False,
                  input_names=['input'],
                  output_names=['scores', 'boxes'],
                  opset_version=11)

model = onnx.load(model_path)

optimized_model = optimizer.optimize_model(model_path,
                                           model_type='bert',
                                           num_heads=12,
                                           hidden_size=768)
optimized_onnx_model_path = f"models/onnx/{model_name}_optimized.onnx"
optimized_model.save_model_to_file(optimized_onnx_model_path)
print('Optimized model saved at:', optimized_onnx_model_path)
print('>> quantizing..')
model = onnx.load(model_path)
quantized_model = quantize(model=model,
                           quantization_mode=QuantizationMode.IntegerOps,
                           force_fusions=True,
                           symmetric_weight=True)
optimized_quantized_onnx_model_path = f"models/onnx/{model_name}_ONNXquantized.onnx"
onnx.save_model(quantized_model, optimized_quantized_onnx_model_path)
print('Quantized & optimized model saved at:',
      optimized_quantized_onnx_model_path)
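To sanity-check the quantized model, it can be run through onnxruntime with a random input of the export shape. A sketch assuming numpy and onnxruntime are installed and the paths above exist.

import numpy as np
import onnxruntime as ort

# Run the quantized model on a random input with the same shape used for export
session = ort.InferenceSession(optimized_quantized_onnx_model_path, providers=["CPUExecutionProvider"])
dummy = np.random.randn(1, 3, 240, 320).astype(np.float32)
scores, boxes = session.run(["scores", "boxes"], {"input": dummy})
print("scores:", scores.shape, "boxes:", boxes.shape)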